0.0.1 Setup
Criteria to exclude students who most likely took a scored exam:
Any PhD students (n = 2)
Any 5th year program students (n = 19)
M4 students at Vanderbilt (n = 5)
Students who did not complete either Step survey (n = 2)
Students who specifically stated they took a scored Step 1 (n=1)
Based on our criteria we would exclude record IDs:
VUSM: 23, 26, 39, 40, 54, 3, 8, 12, 49, 60, 62, 64
HMS: 1, 21, 28, 30, 34, 37, 39, 41, 44, 47, 49, 61
UVA: 33, 47, 80, 81, 83
#the list of IDs we decided as a group to exclude
excludeVU <- c(23, 26, 39, 40, 54, 3, 8, 12, 49, 60, 62, 64)
excludeHMS <- c(1, 21, 28, 30, 34, 37, 39, 41, 44, 47, 49, 61)
excludeUVA <- c(33, 47, 80, 81, 83)
'%!in%' <- function(x,y)!('%in%'(x,y)) #make a way to use the not in command
VU_in <- filter(VUMSdat, record_id %!in% excludeVU)
H_in <- filter(HMSdat, record_id %!in% excludeHMS)
UVA_in <- filter(UVAdat, record_id %!in% excludeUVA)
#now I want to select the columns I'd like to include for all of my analysis (so they're in the proper order for a cbind). this will be relatively easy to come back to edit later, if needed.
#first, remember to include a school identifier
VU_in[,"schoolid"] <- "VU"
UVA_in[,"schoolid"] <- "UVa"
H_in[,"schoolid"] <- "HMS"
############ plan for how I will get the data in a format I want:
# - pull relevant columns by "starts with"
# - confirm all column names match, then
# - rbind together once
# - then I can select from this sheet the questions relevant to Step 1 first with ends with "_1", and those who took Step 1 second with a "_2"
############ Now pulling the common columns we're interested in as predictors and outcomes
##note, for VU I removed ""number_other_courses_step1_1" and starts_with("other_courses_step1_1___1) because these questions were not on the other school surveys
took_step1_VU <-
VU_in %>% select(
starts_with("uworld_percent_step1"),
starts_with("amboss_percent_step1"),
starts_with("length_step1"),
starts_with("practicetest_step1"),
starts_with("full_test_practice_step1"),
starts_with("push_step1"),
starts_with("push_practice_test_step1"),
starts_with("push_nbme_practice_score_step1"),
starts_with("push_uw_practice_score_step1"),
starts_with("final_nbme_practice_score_step1"),
starts_with("final_uw_practice_score_step1"),
starts_with("score_step1"),
"schoolid"
)
took_step1_UVA <-
UVA_in %>% select(
starts_with("uworld_percent_step1"),
starts_with("amboss_percent_step1"),
starts_with("length_step1"),
starts_with("practicetest_step1"),
starts_with("full_test_practice_step1"),
starts_with("push_step1"),
starts_with("push_practice_test_step1"),
starts_with("push_nbme_practice_score_step1"),
starts_with("push_uw_practice_score_step1"),
starts_with("final_nbme_practice_score_step1"),
starts_with("final_uw_practice_score_step1"),
starts_with("score_step1"),
"schoolid"
)
took_step1_H <-
H_in %>% select(
starts_with("uworld_percent_step1"),
starts_with("amboss_percent_step1"),
starts_with("length_step1"),
starts_with("practicetest_step1"),
starts_with("full_test_practice_step1"),
starts_with("push_step1"),
starts_with("push_practice_test_step1"),
starts_with("push_nbme_practice_score_step1"),
starts_with("push_uw_practice_score_step1"),
starts_with("final_nbme_practice_score_step1"),
starts_with("final_uw_practice_score_step1"),
starts_with("score_step1"),
"schoolid"
)
## now rbinding the three schools together
took_step1general <- rbind(took_step1_H, took_step1_UVA, took_step1_VU)
## splitting the dataset so I also have reference sheets specific to step 1 first and step 1 second
took_step1_first <- took_step1general %>% select(ends_with("_1"), "schoolid")
took_step1_second <- took_step1general %>% select(ends_with("_2"), "schoolid")
# profiling missing data (must discuss!!)
visdat::vis_miss(took_step1general) visdat::vis_miss(took_step1_first) #looks like a school specific trend with who is answering what question...visdat::vis_miss(took_step1_second) #looks like a school specific trend with who is answering what question...#these are the variables I can actually analyze
describe(VU_in$score_step2_2) VU_in$score_step2_2
n missing distinct
28 25 19
Value 239 243 244 252 254 255 256 258 260 261 263
Frequency 1 1 1 1 2 2 1 2 2 2 1
Proportion 0.036 0.036 0.036 0.036 0.071 0.071 0.036 0.071 0.071 0.071 0.036
Value 264 265 267 268 270 271 273 275
Frequency 3 1 3 1 1 1 1 1
Proportion 0.107 0.036 0.107 0.036 0.036 0.036 0.036 0.036
describe(VU_in$score_step2_1)VU_in$score_step2_1
n missing distinct
7 46 6
Value 255 264 266 271 272 Nah
Frequency 1 1 1 1 2 1
Proportion 0.143 0.143 0.143 0.143 0.286 0.143
took_step2_VU <-
VU_in %>% select(
starts_with("uworld_percent_step2"),
starts_with("amboss_percent_step2"),
starts_with("length_step2"),
starts_with("practicetest_step2"),
starts_with("full_test_practice_step2"),
starts_with("practice_score_step2"),
starts_with("score_step2"),
starts_with("target_score_step2"),
"schoolid"
) #note, I removed ""number_other_courses_step1_1" and starts_with("other_courses_step1_1___1) because these questions were not on the other school surveys
took_step2_UVA <-
UVA_in %>% select(
starts_with("uworld_percent_step2"),
starts_with("amboss_percent_step2"),
starts_with("length_step2"),
starts_with("practicetest_step2"),
starts_with("full_test_practice_step2"),
starts_with("practice_score_step2"),
starts_with("score_step2"),
starts_with("target_score_step2"),
"schoolid"
)
took_step2_H <-
H_in %>% select(
starts_with("uworld_percent_step2"),
starts_with("amboss_percent_step2"),
starts_with("length_step2"),
starts_with("practicetest_step2"),
starts_with("full_test_practice_step2"),
starts_with("practice_score_step2"),
starts_with("score_step2"),
starts_with("target_score_step2"),
"schoolid"
)
took_step2general <- rbind(took_step2_H, took_step2_UVA, took_step2_VU)
took_step2_first <- took_step2general %>% select(ends_with("_1"), "schoolid")
took_step2_second <- took_step2general %>% select(ends_with("_2"), "schoolid")
visdat::vis_miss(took_step2_first) #looks like a school specific trend with who is answering what question...visdat::vis_miss(took_step2_second) #looks like a school specific trend with who is answering what question...#this function is so I can customize the panels in the correlation pairs plots.
tf = function(x,y) {
ct <- cor.test(x, y)
sprintf("Corr: %0.2f, p: %0.3f\n95%% CI: (%0.2f - %0.2f) \n n pairs = %.0f",
ct$estimate, ct$p.value, ct$conf.int[1], ct$conf.int[2], sum(!is.na(x), !is.na(y))/2)
} #https://stackoverflow.com/questions/75068871/add-p-value-and-or-95-ci-in-the-text-of-a-correlation-matrix-pairs-plot
#I made a pairs plot but clearly some variable classes need to be changed
class(took_step2_second$practice_score_step2_2) <- "integer"
class(took_step2_second$score_step2_2) <- "integer"
ggpairs(took_step2_second, title="Pairs plots", upper=list(continuous=wrap("statistic",text_fn=tf,title=NULL, sep=NULL)), progress = FALSE) #well this means it's time to take a breakstr(took_step2general)'data.frame': 182 obs. of 17 variables:
$ uworld_percent_step2_1 : int 90 NA NA NA NA NA NA NA NA NA ...
$ uworld_percent_step2_2 : int NA 100 74 60 100 95 NA NA 100 90 ...
$ amboss_percent_step2_1 : int NA NA NA NA NA NA NA NA NA NA ...
$ amboss_percent_step2_2 : int NA NA 15 NA 5 NA NA NA NA NA ...
$ length_step2_1 : int 5 NA NA NA NA NA NA NA NA NA ...
$ length_step2_2 : int NA 4 3 4 5 4 NA NA 4 4 ...
$ practicetest_step2_1 : chr "5" NA NA NA ...
$ practicetest_step2_2 : chr NA "4" "4" "7" ...
$ full_test_practice_step2_1: int 1 NA NA NA NA NA NA NA NA NA ...
$ full_test_practice_step2_2: int NA 4 1 4 4 4 NA NA 3 4 ...
$ practice_score_step2_1 : int 266 NA NA NA NA NA NA NA NA NA ...
$ practice_score_step2_2 : chr NA "251" "260" NA ...
$ score_step2_1 : chr "264" NA NA NA ...
$ score_step2_2 : chr NA "262" "264" "269" ...
$ target_score_step2_1 : int 1 NA NA NA NA NA NA NA NA NA ...
$ target_score_step2_2 : int NA 2 2 2 1 2 NA NA 1 2 ...
$ schoolid : chr "HMS" "HMS" "HMS" "HMS" ...
0.1 what about profiling the data we have for if they pushed back an exam?
It doesn’t look like there is a question related to pushing back step 2, just step 1. Is that okay?
describe(took_step1general$push_step1_1)took_step1general$push_step1_1
n missing distinct Info Mean Gmd
88 94 2 0.302 1.886 0.2038
Value 1 2
Frequency 10 78
Proportion 0.114 0.886
describe(took_step1general$push_step1_2)took_step1general$push_step1_2
n missing distinct Info Mean Gmd
66 116 2 0.386 1.848 0.2611
Value 1 2
Frequency 10 56
Proportion 0.152 0.848
1 data cleaning note (discuss as a group if this is a useful data cleaning path anyways)
this was another tactic I was trying but frustratingly, the class of different columns was changed between surveys. So far, I have abandoned this technique in favor of the above, where I am affirmativly selecting the columns (but I still may run into the different class issue)
#try dropping problematic columns and then do bind_rows
# removeextracolsH <-
# H_in %>% select(
# -c(
# "vusmhms_usmle_trends_survey_timestamp",
# "vusmhms_usmle_trends_survey_complete"
# ),
# contains("number_other_courses")
# )
# setdiff(names(H_in), names(UVA_in))
# setdiff(names(removeextracolsH), names(UVA_in))
#
# removeextracolsUVA <-
# UVA_in %>% select(
# -contains("number_other_courses"),
# -c(
# "vusmuva_usmle_trends_survey_timestamp",
# "vusmuva_usmle_trends_survey_complete"
# )
# )
# setdiff(names(removeextracolsUVA), names(removeextracolsH)) #confirming the cols match now
#
# removeextracolsVU <- VU_in %>% select(-contains("number_other_courses"), -contains("other_courses"))
# setdiff(names(removeextracolsUVA), names(removeextracolsVU)) #confirming the cols match now
#
# class(removeextracolsH$practicetest_step1_1) <- "integer" #this column was coded as a character so it wouldn't merge previously
# bind_rows(list(removeextracolsVU, removeextracolsH), .id = "id")